Lab_5_Cooper.qmd

Author

Dan Cooper

library(tidyverse)
library(knitr)
library(janitor)
library(DT)

Kable

# Keep only the setosa rows whose Sepal.Length exceeds 5, then render the
# subset as a static knitr table.
iris_setosa <- filter(iris, Species == "setosa", Sepal.Length > 5)
library(knitr)
iris_setosa |>
  kable()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
5.1 3.5 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
5.4 3.7 1.5 0.2 setosa
5.8 4.0 1.2 0.2 setosa
5.7 4.4 1.5 0.4 setosa
5.4 3.9 1.3 0.4 setosa
5.1 3.5 1.4 0.3 setosa
5.7 3.8 1.7 0.3 setosa
5.1 3.8 1.5 0.3 setosa
5.4 3.4 1.7 0.2 setosa
5.1 3.7 1.5 0.4 setosa
5.1 3.3 1.7 0.5 setosa
5.2 3.5 1.5 0.2 setosa
5.2 3.4 1.4 0.2 setosa
5.4 3.4 1.5 0.4 setosa
5.2 4.1 1.5 0.1 setosa
5.5 4.2 1.4 0.2 setosa
5.5 3.5 1.3 0.2 setosa
5.1 3.4 1.5 0.2 setosa
5.1 3.8 1.9 0.4 setosa
5.1 3.8 1.6 0.2 setosa
5.3 3.7 1.5 0.2 setosa

DT

# Display the previously built iris_setosa subset as a DT table.
library(DT)
iris_setosa |>
  datatable()

Bound and Present by Datatable

# Filter and display in a single pipeline, without storing the subset.
iris |>
  filter(Species == "setosa", Sepal.Length > 5) |>
  datatable()

Create New Object

# Store the filtered rows under a name first, then hand that object to DT.
iris_setosa <- iris |>
  filter(Species == "setosa" & Sepal.Length > 5)

iris_setosa |>
  datatable()

NEON Data Table

# Load the tab-separated NEON bin export and preview its first rows.
NEON_MAGs <- read_tsv(
  file = "/work/pi_bio678_umass_edu/data_NEON/exported_img_bins_Gs0166454_NEON.tsv"
)
NEON_MAGs |>
  head()
# A tibble: 6 × 21
  bin_oid     `Bin ID` `Genome Name` `IMG Genome ID` `Bin Quality` `Bin Lineage`
  <chr>       <chr>    <chr>                   <dbl> <chr>         <chr>        
1 3300075492… 3300075… Soil microbi…      3300075492 MQ            Bacteria     
2 3300075492… 3300075… Soil microbi…      3300075492 MQ            Bacteria     
3 3300075492… 3300075… Soil microbi…      3300075492 MQ            <NA>         
4 3300075492… 3300075… Soil microbi…      3300075492 MQ            Bacteria; Ac…
5 3300075492… 3300075… Soil microbi…      3300075492 MQ            Bacteria; Ac…
6 3300075492… 3300075… Soil microbi…      3300075492 MQ            Bacteria; Ac…
# ℹ 15 more variables: `GTDB Taxonomy Lineage` <chr>, `Bin Methods` <chr>,
#   `Created By` <chr>, `Date Added` <date>, `Bin Completeness` <dbl>,
#   `Bin Contamination` <dbl>, `Average Coverage` <lgl>,
#   `Total Number of Bases` <dbl>, `5s rRNA` <dbl>, `16s rRNA` <dbl>,
#   `23s rRNA` <dbl>, `tRNA Genes` <dbl>, `Gene Count` <dbl>,
#   `Scaffold Count` <dbl>, `GOLD Study ID` <chr>
# Print a compact overview of each column's type and leading values.
NEON_MAGs |>
  str()
spc_tbl_ [16,669 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ bin_oid              : chr [1:16669] "3300075492_s0" "3300075492_s1" "3300075492_s100" "3300075492_s106" ...
 $ Bin ID               : chr [1:16669] "3300075492_s0" "3300075492_s1" "3300075492_s100" "3300075492_s106" ...
 $ Genome Name          : chr [1:16669] "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ "Soil microbial communities from University of Notre Dame Environmental Research Center NEON Field Site, Michiga"| __truncated__ ...
 $ IMG Genome ID        : num [1:16669] 3.3e+09 3.3e+09 3.3e+09 3.3e+09 3.3e+09 ...
 $ Bin Quality          : chr [1:16669] "MQ" "MQ" "MQ" "MQ" ...
 $ Bin Lineage          : chr [1:16669] "Bacteria" "Bacteria" NA "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales" ...
 $ GTDB Taxonomy Lineage: chr [1:16669] "Bacteria; Acidobacteriota; Terriglobia; Acidoferrales; UBA7541; Acidoferrum" "Bacteria; Desulfobacterota_B; Binatia; Binatales; Binataceae; Binatus; Binatus soli" "Archaea; Thermoplasmatota; Thermoplasmata; UBA184; UBA184; UBA184" "Bacteria; Actinomycetota; Thermoleophilia; Solirubrobacterales; Solirubrobacteraceae; Palsa-744" ...
 $ Bin Methods          : chr [1:16669] "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" "SemiBin2:v2.1.0, CheckM2:v1.0.2, GTDB-Tk:v2.4.0, GTDB-Tk-database:release220" ...
 $ Created By           : chr [1:16669] "IMG_PIPELINE" "IMG_PIPELINE" "IMG_PIPELINE" "IMG_PIPELINE" ...
 $ Date Added           : Date[1:16669], format: "2025-01-23" "2025-01-23" ...
 $ Bin Completeness     : num [1:16669] 95.9 99.6 57.9 74 67.3 ...
 $ Bin Contamination    : num [1:16669] 6.31 0 2.39 8.19 3.95 6.54 5.24 0.63 0.61 4.56 ...
 $ Average Coverage     : logi [1:16669] NA NA NA NA NA NA ...
 $ Total Number of Bases: num [1:16669] 5600425 3706224 1233791 2084993 3809196 ...
 $ 5s rRNA              : num [1:16669] 1 1 1 1 0 0 0 1 1 0 ...
 $ 16s rRNA             : num [1:16669] 2 0 0 1 0 0 0 0 0 0 ...
 $ 23s rRNA             : num [1:16669] 2 1 1 1 0 0 0 1 0 0 ...
 $ tRNA Genes           : num [1:16669] 57 51 30 33 31 33 29 25 30 30 ...
 $ Gene Count           : num [1:16669] 4976 3617 1344 2284 3848 ...
 $ Scaffold Count       : num [1:16669] 26 45 210 322 398 85 527 435 243 409 ...
 $ GOLD Study ID        : chr [1:16669] "Gs0166454" "Gs0166454" "Gs0166454" "Gs0166454" ...
 - attr(*, "spec")=
  .. cols(
  ..   bin_oid = col_character(),
  ..   `Bin ID` = col_character(),
  ..   `Genome Name` = col_character(),
  ..   `IMG Genome ID` = col_double(),
  ..   `Bin Quality` = col_character(),
  ..   `Bin Lineage` = col_character(),
  ..   `GTDB Taxonomy Lineage` = col_character(),
  ..   `Bin Methods` = col_character(),
  ..   `Created By` = col_character(),
  ..   `Date Added` = col_date(format = ""),
  ..   `Bin Completeness` = col_double(),
  ..   `Bin Contamination` = col_double(),
  ..   `Average Coverage` = col_logical(),
  ..   `Total Number of Bases` = col_double(),
  ..   `5s rRNA` = col_double(),
  ..   `16s rRNA` = col_double(),
  ..   `23s rRNA` = col_double(),
  ..   `tRNA Genes` = col_double(),
  ..   `Gene Count` = col_double(),
  ..   `Scaffold Count` = col_double(),
  ..   `GOLD Study ID` = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
# Normalise the column names (e.g. `Bin Quality` becomes bin_quality) so they
# can be used without backticks, then tally the bins per quality label with
# the most frequent label first.
NEON_MAGs <- janitor::clean_names(NEON_MAGs)
count(NEON_MAGs, bin_quality, sort = TRUE)
# A tibble: 2 × 2
  bin_quality     n
  <chr>       <int>
1 MQ          15369
2 HQ           1300
# The same tally, left in count()'s default order, rendered as a static table.
NEON_MAGs |>
  count(bin_quality) |>
  kable()
bin_quality n
HQ 1300
MQ 15369
# DT table of the bins whose quality label is "HQ".
NEON_MAGs |>
  filter(bin_quality == "HQ") |>
  datatable()

# Static table of GTDB lineage and size for bins with more than
# 10,000,000 bases.
NEON_MAGs |>
  select(gtdb_taxonomy_lineage, total_number_of_bases) |>
  filter(total_number_of_bases > 10000000) |>
  kable()
gtdb_taxonomy_lineage total_number_of_bases
Bacteria; Pseudomonadota; Gammaproteobacteria; Steroidobacterales; Steroidobacteraceae; 13-2-20CM-66-19 10115899
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 11932805
Bacteria; Actinomycetota; Actinomycetes; Streptomycetales; Catenulisporaceae; Catenulispora 12420050
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae; JAJPJC01 13151516
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 12046820
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae; JAQGHR01 12217509
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 12942605
Bacteria; Actinomycetota; Actinomycetes; Mycobacteriales; Pseudonocardiaceae; Actinophytocola 11293845
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 14411455
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 10890912
Bacteria; Planctomycetota; Planctomycetia; Pirellulales; JAICIG01; JAICLL01 11379784
Bacteria; Acidobacteriota; Terriglobia; Bryobacterales; Bryobacteraceae; PALSA-243 10475466
NA 10248256
Bacteria; Pseudomonadota; Alphaproteobacteria; Rhizobiales; Xanthobacteraceae 10436042
Bacteria; Planctomycetota; Planctomycetia; Gemmatales; Gemmataceae 13087515
Bacteria; Actinomycetota; Actinomycetes; Mycobacteriales; Pseudonocardiaceae; Actinophytocola 10431101
Bacteria; Myxococcota; Polyangia; Polyangiales; JAFGIB01 10030262
Bacteria; Myxococcota; Polyangia; Polyangiales; Polyangiaceae; JANYGI01 12560711
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 11026377
Bacteria; Planctomycetota; Planctomycetia; Gemmatales; Gemmataceae 12479517
Bacteria; Acidobacteriota; Terriglobia; Bryobacterales; Bryobacteraceae; Solibacter 10617884
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae; JAQGHR01 10431455
Bacteria; Planctomycetota; Planctomycetia; Isosphaerales; Isosphaeraceae 12864765
Bacteria; Actinomycetota; Actinomycetes; Mycobacteriales; Mycobacteriaceae; Mycobacterium 10583130
# Bins whose GTDB lineage string contains "Bacteroidota".
NEON_MAGs |>
  filter(str_detect(gtdb_taxonomy_lineage, "Bacteroidota")) |>
  datatable()

# Bins whose genome name contains "Yellowstone NP".
NEON_MAGs |>
  filter(str_detect(genome_name, "Yellowstone NP")) |>
  datatable()
# Split the "; "-delimited GTDB lineage into one column per rank, keeping the
# original column alongside. Some lineages carry a seventh, species-level
# entry (e.g. "...; Binatus; Binatus soli"), so a `species` column is included
# rather than letting separate() discard that piece with a warning. Lineages
# that stop at a higher rank are padded with NA on the right (fill = "right").
NEON_MAGs_tax <- NEON_MAGs |>
  separate(
    gtdb_taxonomy_lineage,
    into = c(
      "domain", "phylum", "class", "order", "family", "genus", "species"
    ),
    sep = "; ",
    remove = FALSE,
    fill = "right"
  )

# Number of bins per phylum, largest first.
NEON_MAGs_tax |>
  count(phylum, sort = TRUE) |>
  datatable()
# Break genome_name into site and sample components.
NEON_MAGs_tax_sample <- NEON_MAGs_tax |>
  # Drop the boilerplate prefix. The names in this export start with
  # "Soil microbial communities from ", so the previous pattern
  # ("Terrestrial soil ...") never matched; both spellings are accepted here.
  mutate(
    genome_name = str_replace(
      genome_name,
      "^(Terrestrial soil|Soil) microbial communities from ",
      ""
    )
  ) |>
  # Use the first " - " to split the column in two
  separate(genome_name, c("site", "sample_name"), " - ") |>
  # Get rid of the common string "-comp-1"
  mutate(sample_name = str_replace(sample_name, "-comp-1", "")) |>
  # Separate the sample name into site ID and plot info
  separate(
    sample_name,
    c("site_ID", "subplot.layer.date"),
    "_",
    remove = FALSE
  ) |>
  # Separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("subplot", "layer", "date"), "-")

# Number of bins per site, largest first.
NEON_MAGs_tax_sample |>
  count(site, sort = TRUE) |>
  datatable()

Exercises

Exercise 1

# Open the full iris data for inspection, then keep only the virginica rows
# and show them as a DT table.
view(iris)
iris_subset <- subset(iris, Species == "virginica")
DT::datatable(iris_subset)

Exercise 2

# Bins with at least one 16S rRNA gene. filter() drops rows where the count
# is NA, whereas base `[` with a logical index would turn each NA into an
# all-NA row in the displayed table.
NEON_MAGs |>
  filter(x16s_r_rna >= 1) |>
  DT::datatable()

Exercise 3

# Genome name, GTDB lineage and size for the Lower Tombigbee bins.
# gold_study_id holds study accessions such as "Gs0166454", so comparing it
# with a site name matches no rows. The site is part of genome_name, so the
# match is done there instead.
NEON_MAGs |>
  filter(str_detect(genome_name, "Lower Tombigbee")) |>
  select(genome_name, gtdb_taxonomy_lineage, total_number_of_bases) |>
  DT::datatable()

Exercise 4

# Count bins per taxonomic class within study Gs0166454. The output column is
# named class_count, so the tally uses the `class` rank split out in
# NEON_MAGs_tax rather than the full bin_lineage string, which would give one
# row per distinct lineage instead of per class.
# NOTE(review): `class` comes from the GTDB lineage; confirm that this, and
# not the class embedded in bin_lineage, is the intended source.
NEON_MAGs_tax |>
  filter(gold_study_id == "Gs0166454") |>
  count(class, name = "class_count") |>
  DT::datatable()

Exercise 5

# Count the bins whose GTDB lineage mentions Actinomycetota, grouped by
# GOLD study ID.
# NOTE(review): gold_study_id looks constant ("Gs0166454") in the rows shown
# earlier, so this probably yields a single row - confirm whether a per-site
# breakdown was intended.
NEON_MAGs |>
  filter(grepl("Actinomycetota", gtdb_taxonomy_lineage)) |>
  count(gold_study_id, name = "Actinomycetota_count") |>
  DT::datatable()